import os
import tempfile
from io import BytesIO
from tkinter import Image

import PyPDF2
import pytesseract
from docx import Document


# C0 control characters other than tab, newline and carriage return.  They are
# not valid in XML, so lxml (which python-docx writes through) raises
# ValueError on them; Tesseract output commonly ends with a form feed (\x0c).
_XML_UNSAFE_CONTROLS = {
    code: None for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
}


def _xml_safe(text):
    """Return *text* with XML-incompatible control characters removed."""
    return text.translate(_XML_UNSAFE_CONTROLS)


def _ocr_embedded_images(page, lang):
    """Run Tesseract over every image embedded in *page* and join the results.

    Each image is written to a scratch file and handed to pytesseract by
    path, so no extra imaging import is needed in this module.
    """
    embedded_images = list(page.images)
    if not embedded_images:
        return ""

    recognized = []
    with tempfile.TemporaryDirectory() as scratch_dir:
        for index, embedded in enumerate(embedded_images):
            # Only the extension of the PDF-internal name is reused; the base
            # name is generated here so odd resource names cannot escape the
            # scratch directory or collide with each other.
            suffix = os.path.splitext(embedded.name)[1] or ".png"
            image_path = os.path.join(scratch_dir, f"image_{index}{suffix}")
            with open(image_path, "wb") as image_file:
                image_file.write(embedded.data)
            ocr_text = _xml_safe(
                pytesseract.image_to_string(image_path, lang=lang)
            ).strip()
            if ocr_text:
                recognized.append(ocr_text)
    return "\n".join(recognized)


def pdf_to_word_with_ocr(pdf_path, word_path, lang="eng"):
    """Convert a PDF to a Word document, using OCR for pages without text.

    For every page the embedded text layer is used when it is non-empty.
    Otherwise the images embedded in the page (how scanned PDFs store their
    pages) are OCR'd with Tesseract.  PyPDF2 cannot rasterize a whole page,
    so vector-only pages without a text layer produce no output.

    Args:
        pdf_path: Path of the PDF file to read.
        word_path: Path of the .docx file to write.
        lang: Tesseract language code(s) used for OCR, e.g. ``"eng"`` or
            ``"chi_sim+eng"``.  Defaults to ``"eng"``.

    Note:
        Relies on ``PageObject.images``; this is assumed to be available in
        the installed PyPDF2 (3.x) -- TODO confirm against the pinned version.
    """
    doc = Document()
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # The loop variable already is the page object; re-indexing
            # reader.pages by page number is unnecessary and error-prone.
            text = _xml_safe(page.extract_text() or "").strip()
            if not text:
                # No usable text layer: fall back to OCR of embedded images.
                text = _ocr_embedded_images(page, lang)
            if text:
                doc.add_paragraph(text)
    doc.save(word_path)
    print(f"Converted '{pdf_path}' to '{word_path}' with OCR")

# Example usage.
# Raw strings keep the Windows backslashes literal: in a normal string "\海"
# and "\医" are invalid escape sequences, which Python warns about and plans
# to reject.  The resulting path values are unchanged.
pdf_file = r'F:\海思太科\医疗保障信息平台数据归集交换库设计说明书-V3.3.pdf'
word_file = r'F:\海思太科\医疗保障信息平台数据归集交换库设计说明书-V3.31111222.docx'
pdf_to_word_with_ocr(pdf_file, word_file)